#Packages

#1. Load data

#2. Descriptive statistics

##2.1. Number of factor levels of variables

length(unique(db$ID))
## [1] 15253
length(unique(db$Gender))
## [1] 2
length(unique(db$Inst_HUN))
## [1] 52
length(unique(db$Pr_area_ENG))
## [1] 32
length(unique(db$Pr_HUN))
## [1] 398
length(unique(db$Pr_ENG_bachelor))
## [1] 111
length(unique(db$Grad_level))
## [1] 7
length(unique(db$FEOR4))
## [1] 402
length(unique(db$FEOR3))
## [1] 109
length(unique(db$ISCO1))
## [1] 9
length(unique(db$ISCO3))
## [1] 115
length(unique(db$Req_HEd))
## [1] 2
length(unique(db$Head_couty_HUN))
## [1] 0
length(unique(db$Weekly_hrs))
## [1] 49
length(unique(db$Mountly_wage_HUF))
## [1] 7769

##2.2. Number of graduates in different groups

## [1] 15253
## 
##    A    B    E    F    M    O    P 
##  921 7402 3252 1889 1334   46  409
## [1] 45
## [1] 15
## [1] 110
## [1] 372
## [1] 106

##2.3 Distribution of graduates working in occupation category that requires higher education degree

##    
##              0          1
##   A 0.61129207 0.38870793
##   B 0.31234801 0.68765199
##   E 0.15221402 0.84778598
##   F 0.33086289 0.66913711
##   M 0.09520240 0.90479760
##   O 0.08695652 0.91304348
##   P 0.32518337 0.67481663

0: occupations that does NOT require higher education 1: occupations that does require higher education

A: higher vocational trainings B: bachelor E: univerity (before Bologna) F: collage (before Bologna) M: master O: undivided training (e.g loyar, doctor) P: special teacher training

##2.4. Distribution of bachelor graduates working in an occupation that requires higher education degree (Fig1)

##                                              
##                                                       0         1
##   Agricultural Science                        0.3925620 0.6074380
##   Art Education                               0.3200000 0.6800000
##   Arts                                        0.2291667 0.7708333
##   Arts and Humanities                         0.4260204 0.5739796
##   Computer Science and Information Technology 0.1082474 0.8917526
##   Economic Science                            0.3412810 0.6587190
##   Engineering Science                         0.1844106 0.8155894
##   Health Science                              0.1239193 0.8760807
##   Law                                         0.3333333 0.6666667
##   Military sciences                           0.2058824 0.7941176
##   Natural sciences                            0.4038462 0.5961538
##   Religious Science                           0.2500000 0.7500000
##   Social Science                              0.3614035 0.6385965
##   Sport Science                               0.4666667 0.5333333
##   Teacher Training                            0.3250774 0.6749226

##2.5. Distribution of graduates that work on occupation which requiring higher education degree by counties in Hungary (Fig2)

##                              
##                                  0    1
##   HU101                        829 1748
##   HU102                        165  287
##   HU211                         50  108
##   HU212                         55   64
##   HU213                         52   81
##   HU221                         95  169
##   HU222                         34   68
##   HU223                         29   57
##   HU231                         29   50
##   HU232                         21   44
##   HU233                         16   39
##   HU311                         73  154
##   HU312                         35   56
##   HU313                         15   23
##   HU321                         98  202
##   HU322                         39  110
##   HU323                         72  148
##   HU331                         49  113
##   HU332                         65  135
##   HU333                         86  175
##   not NUTS important taxpayer  379 1206
##   not NUTS special tax          26   53

##2.6. Gender pay gap (Fig.3, Fig.4)

Lollipop chart on wage differences by gender

## Using Mountly_wage_HUF as value column: use value.var to override.

## Using Mountly_wage_HUF as value column: use value.var to override.

##2.7. Strongest connections (Table4)

##                        Var1
## 1       Business management
## 2       Mechanical Engineer
## 3             IT  engineer 
## 4                  Tourism 
## 5         Electric engineer
## 6    Finance and Accounting
## 7  Nursing and Patient Care
## 8  Nursing and Patient Care
## 9      Software Engineering
## 10   Finance and Accounting
##                                                 Occupation value exp diff
## 1       Financial and mathematical associate professionals   153  49  104
## 2  Engineering professionals (excluding electrotechnology)   114  13  101
## 3        Software and applications developers and analysts   102   7   95
## 4                               Client information workers    83  16   67
## 5                              Electrotechnology engineers    65   3   62
## 6       Financial and mathematical associate professionals    80  18   62
## 7                               Other health professionals    55   2   53
## 8                                      Nurses and midwives    47   2   45
## 9        Software and applications developers and analysts    48   4   44
## 10                 Numerical and material recording clerks    49   7   42

##2.8. Weakest connections (Table5)

##                               Var1
## 1              Business management
## 2  Communication and Media Science
## 3              Business management
## 4                         Tourism 
## 5                Electric engineer
## 6              Business management
## 7                        Andragogy
## 8                Electric engineer
## 9           Finance and Accounting
## 10          Commerce and marketing
##                                                 Occupation value exp diff
## 1  Engineering professionals (excluding electrotechnology)    21  48  -27
## 2  Engineering professionals (excluding electrotechnology)     6  26  -20
## 3        Software and applications developers and analysts     9  28  -19
## 4  Engineering professionals (excluding electrotechnology)     1  20  -19
## 5                                 Business services agents     3  21  -18
## 6             Physical and engineering science technicians     9  26  -17
## 7  Engineering professionals (excluding electrotechnology)     4  20  -16
## 8       Financial and mathematical associate professionals     1  15  -14
## 9  Engineering professionals (excluding electrotechnology)     3  17  -14
## 10 Engineering professionals (excluding electrotechnology)     1  15  -14

#3. Analysis of degree distributions

##3.1. “Program set” of bipartite graph (Fig. 5)

The initial distribution of data can be seen on figure below.

###Initial estimation

This estimation calcualate \(K_{min}\), \(\gamma\) and makes CDF and determine minimum \(D\). With {poweRlaw} functions very easy to achive suggested steps 1-3.

## [1] 26
## [1] 1.930942
## [1] 0.09800509

###Scanning whole range

And the fitted power-law on CDF curve.

###Investigate goodness of fit

Initial estimated \(\gamma\) and \(K_{min}\) denoted with blue colour on the figure.

## [1] 0.074

The \(p\) value is 0.074. If \(p\) value more than 1% means that null hypothesis cannot be rejected maybe it is a power-law distribution.

###Fitting real distribution

This figure compares different distributions. (red: power-law, green: lognormal, blue: Poisson, magenta: exponential)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Compare lognormal and powerlaw distribution with Vuong’s test

## [1] 0.6281597

##3.2. “Occupation set” of bigraph (Fig. 6)

The initial distribution of data can be seen on figure below.

It is very similar to power-law distribution but let it see with {powerLaw}.

###Initial estimation

This estimation calcualate \(K_{min}\), \(\gamma\) and makes CDF and determine minimum \(D\). With {poweRlaw} functions very easy to achive suggested steps 1-3.

## [1] 63
## [1] 2.177573
## [1] 0.1159359

###Scanning whole range

And the fitted power-law on CDF curve.

###Investigate goodness of fit

Initial estimated \(\gamma\) and \(K_{min}\) denoted with blue colour on the figure.

## [1] 0.1

The \(p\) value is 0.1. If \(p\) value more than 1% means that null hypothesis cannot be rejected maybe it is a power-law distribution.

###Fitting real distribution

This figure compares different distributions. (red: power-law, green: lognormal, blue: Poisson, magenta: exponential)

Compare lognormal and powerlaw distribution with Vuong’s test

## [1] 0.2749639

#4. Node measures

##4.1. Centralities

## Warning: `evcent()` was deprecated in igraph 2.0.0.
## ℹ Please use `eigen_centrality()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `list.vertex.attributes()` was deprecated in igraph 2.0.0.
## ℹ Please use `vertex_attr_names()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `get.adjlist()` was deprecated in igraph 2.0.0.
## ℹ Please use `as_adj_list()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `bipartite.projection()` was deprecated in igraph 2.0.0.
## ℹ Please use `bipartite_projection()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `get.edgelist()` was deprecated in igraph 2.0.0.
## ℹ Please use `as_edgelist()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `page.rank()` was deprecated in igraph 2.0.0.
## ℹ Please use `page_rank()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##                             nodes occ degree.weighted degree betweenness.w
## 1        Nursing and Patient Care   0             192     29       319.692
## 2                       Andragogy   0             367     65      1951.548
## 3             Business management   0             874     71      3714.669
## 4               Electric engineer   0             258     31       812.092
## 5                        Tourism    0             364     55      1079.122
## 6 Communication and Media Science   0             469     55      1705.610
##   betweenness eigen.centr.w eigen.centr power.centr.w power.centr
## 1    205.2932    0.03406125   0.4782548    -0.3633216 -0.47515436
## 2   1270.4400    0.30107024   0.8684517    -0.3696028 -0.73715677
## 3   1480.1857    1.00000000   0.9251576    -0.5914092 -0.45960988
## 4    580.4062    0.09377294   0.4275998    -0.4443834  0.45832985
## 5    815.1195    0.30629223   0.7842568    -0.6436639 -0.07243149
## 6    876.7196    0.37082755   0.7415642    -0.5505395  0.42883356
##   transitivity.w transitivity closeness.w   closeness pagerank.w   pagerank
## 1       2.816069    0.2221909 0.001926782 0.001926782 0.01111493 0.01111493
## 2      10.818187    0.2151061 0.002331002 0.002331002 0.02176081 0.02176081
## 3      23.980222    0.2159743 0.002386635 0.002386635 0.04571265 0.04571265
## 4       8.903414    0.1923984 0.001956947 0.001956947 0.01578785 0.01578785
## 5      11.200653    0.2187802 0.002227171 0.002227171 0.02070522 0.02070522
## 6      13.757914    0.2032963 0.002207506 0.002207506 0.02662744 0.02662744
##      knn.w
## 1 101.9426
## 2 102.4841
## 3 102.2990
## 4 100.2730
## 5 102.8040
## 6 102.7469

##4.2. Distance distribution

## [1] 2.552741

fitted equation: function(d) \(k*exp(-1/2*(d-mu)^2/sigma^2)\)

k = 0.1904586 mu = 2.5388282 sigma = 0.8250279

The variance of distances is sigma=0.8250279, indicating that most path legths are in a close vicinity of mean d. These are manifestations of the small world property. [Barabasi: Network science, 2015]

##4.3. Farthest nodes

## [1] "831 - Architecture "
## [1] "752 - Architecture "

#5. Clustering methods

##5.1. Create purified graph Purified with configuration model (alpha=0.5)

Number of connection in original graph and purified graph: * before: L = 7402 * after: L_pur = 7148

##5.2. Louvain (Fig. 8)

###Table 6.

##       1     2     3     4     5
## 5 0.988 0.889 0.297 0.422 2.061
## 4 0.533 0.735 0.293 3.971 0.739
## 3 0.470 0.602 3.199 0.214 0.355
## 2 0.637 3.810 0.589 0.650 0.871
## 1 1.738 0.658 0.409 0.439 0.948

###Germanistics An expample

## 
##                                        Administrative and specialised secretaries 
##                                                                        0.02083333 
##                                                Authors; journalists and linguists 
##                                                                        0.04166667 
##                                                          Business services agents 
##                                                                        0.08333333 
##                                                        Client information workers 
##                                                                        0.06250000 
##                                                Database and network professionals 
##                                                                        0.10416667 
##                           Engineering professionals (excluding electrotechnology) 
##                                                                        0.04166667 
##                                Financial and mathematical associate professionals 
##                                                                        0.02083333 
##                                                             General office clerks 
##                                                                        0.04166667 
## Information and communications technology operations and user support technicians 
##                                                                        0.08333333 
##                                                               Legal professionals 
##                                                                        0.04166667 
##                                                 Machinery mechanics and repairers 
##                                                                        0.02083333 
##                                     Mining and mineral processing plant operators 
##                                                                        0.02083333 
##                                                    Other clerical support workers 
##                                                                        0.12500000 
##                                              Other health associate professionals 
##                                                                        0.02083333 
##                                                           Other services managers 
##                                                                        0.02083333 
##                                                      Other teaching professionals 
##                                                                        0.02083333 
##                                      Physical and engineering science technicians 
##                                                                        0.02083333 
##                                                    Professional services managers 
##                                                                        0.02083333 
##                                     Regulatory government associate professionals 
##                                                                        0.02083333 
##                                           Sales and purchasing agents and brokers 
##                                                                        0.04166667 
##                                                      Secondary education teachers 
##                                                                        0.06250000 
##                                                             Secretaries (general) 
##                                                                        0.02083333 
##                                                Social and religious professionals 
##                                                                        0.04166667

##5.3. BRIM - Barber algorithm (Fig. 9)

This algorithm has a bootstrapping step. That is why there is no two exactly same figure. But modules are the same. Seed was not set.

#6. Multi-resolution method

##6.1. Reamain L after purification (Fig. 7)

L <- rep(0, 101)

for (i in 0:100){
j <- i/10
DF.pur <- melt(A)
DF.pur <- DF.pur[DF.pur$value!=0,]
DF.pur$Var2 <- as.factor(DF.pur$Var2)

degree.tr <- data.frame(rownames(A),rowSums(A))
names(degree.tr) <- c("names", "degree.tr")
degree.oc <- data.frame(colnames(A),colSums(A))
names(degree.oc) <- c("names", "degree.oc")

DF.pur <- left_join(DF.pur, degree.tr, by=c("Var1"="names"))
DF.pur <- left_join(DF.pur, degree.oc, by=c("Var2"="names"))

DF.pur$exp <- round((DF.pur$degree.tr * DF.pur$degree.oc * j) / sum(A)) #(k_i*k_j)/(L)
DF.pur$diff <- DF.pur$value - DF.pur$exp
DF.pur$sparse <- ifelse(DF.pur$diff>=0, DF.pur$value, 0)
L[i+1] <- sum(DF.pur$sparse)
}

L.df <- data.frame(alpha=seq(0,10, by=0.1), L=L)
L.df$L <- L.df$L/7402
ggplot(L.df) + geom_point(aes(x=alpha, y=L)) + 
  ylab(expression(L[p]/L)) + xlab(expression(alpha)) + 
  scale_x_continuous(breaks=seq(0,10,by=1)) +
  theme_bw()

#ggsave("Fig_Lp_vs_alpha.pdf", height=4.5, width=7)

##6.2. Multi-resolution method with Louvain: training programs

###Scanning gamma (Fig. 10)

Q.var <- rep(0, 101)
clust <- matrix(0, nrow=223, ncol=101) #223 node, 101 gamma value
L <- rep(0, 101)
connections <- rep(0, 101)

for (i in 0:100){
j <- i/10
DF.pur <- melt(A)
DF.pur <- DF.pur[DF.pur$value!=0,]
DF.pur$Var2 <- as.factor(DF.pur$Var2)

degree.tr <- data.frame(rownames(A),rowSums(A))
names(degree.tr) <- c("names", "degree.tr")
degree.oc <- data.frame(colnames(A),colSums(A))
names(degree.oc) <- c("names", "degree.oc")

DF.pur <- left_join(DF.pur, degree.tr, by=c("Var1"="names"))
DF.pur <- left_join(DF.pur, degree.oc, by=c("Var2"="names"))

DF.pur$exp <- round((DF.pur$degree.tr * DF.pur$degree.oc * j) / sum(A)) #(k_i*k_j)/(L)
DF.pur$diff <- DF.pur$value - DF.pur$exp
DF.pur$sparse <- ifelse(DF.pur$diff>=0, DF.pur$value, 0)
L[i+1] <- sum(DF.pur$sparse)
connections[i+1] <- 2054-sum(DF.pur$sparse==0)

A.pur <- acast(DF.pur, Var1 ~ Var2, value.var = "sparse")
A.pur[is.na(A.pur)] <- 0

g.pur <- graph.incidence(A.pur, directed=F, weighted = T)

lou <- cluster_louvain(g.pur)

clust[,i+1] <- lou$membership
Q.var[i+1] <- modularity(lou)
}

rownames(clust) <- lou$names
colnames(clust) <- seq(0, 10, 0.1)

###Number of clusters

C <- sapply(1:ncol(clust), function(x) max(clust[,x]))
C <- data.frame(seq(0,10,0.1), C)
colnames(C)[1] <- "gamma"
#ggplot(C, aes(x=gamma, y=C)) + geom_point() + theme_bw()
ggplot(C[1:50,], aes(x=gamma, y=C)) + geom_point(size=1.2) + theme_bw() +
  annotate("segment", x = 1.8, xend = 1.2, y = 5, yend = 5, colour="blue", size=1, arrow=arrow()) + 
  annotate("segment", x = 4.2, xend = 3.6, y = 10, yend = 10, colour="blue", size=1, arrow=arrow()) +
  labs(y="Number of clusters", x="alpha")

#ggsave("cikkbe_Fig_C_vs_gamma.pdf", height = 14, width=16, units="cm")

###Number of weights

L <- data.frame(seq(0,10,0.1), L)
colnames(L)[1] <- "gamma"
ggplot(L, aes(x=gamma, y=L)) + geom_point() + theme_bw()

ggplot(L[1:70,], aes(x=gamma, y=L)) + geom_point() + theme_bw()

###Number of links

connections <- data.frame(seq(0,10,0.1), connections)
colnames(connections)[1] <- "gamma"
ggplot(connections, aes(x=gamma, y=connections)) + geom_point() + theme_bw()

ggplot(connections[1:70,], aes(x=gamma, y=connections)) + geom_point() + theme_bw()

###Similarity of nodes

S <- matrix(0, nrow=223, ncol=223)
rownames(S) <- rownames(clust)
colnames(S) <- rownames(clust)
res1 <- 1
res2 <- 11
for (i in 1:223){
  for (j in 1:223){
    S[i,j] <- sum(clust[i,res1:res2]-clust[j,res1:res2]==0)
    S[j,i] <- S[i,j]
  }
}

S.pr <- S[1:110, 1:110]
S.pr <- S.pr/(res2-res1+1)
longData<-melt(S.pr)
longData<-longData[longData$value!=0,]

#o <- seriate(S.pr, method = "BEA_TSP")
#longData$Var1 <- factor(longData$Var1, levels=names(unlist(o[[1]][])))
#longData$Var2 <- factor(longData$Var2, levels=names(unlist(o[[1]][])))

#cl <- data.frame(names(unlist(o[[1]][])))

ord <- read.csv("ordering.csv")

#this ordering was saved for repeatability

longData$Var1 <- factor(longData$Var1, levels=ord[,2])
longData$Var2 <- factor(longData$Var2, levels=ord[,2])

cl <- data.frame(ord[,2])

colnames(cl)[1] <- "programs"

clust2 <- data.frame(rownames(clust), clust[,1])
rownames(clust2) <- 1:223
colnames(clust2) <- c("programs", "clusters")

cl <- left_join(cl, clust2, by="programs")

#color palette for axis
axis.x.colour <- (cl[,2] %>% plyr::mapvalues(from=c(1:10), to=c("#B40404", "#e68a00", "#0B6121", "#000099", "#0099cc", "#00cc99", "#6600cc", "#e600e6", "#999900", "#7094db")))
## The following `from` values were not present in `x`: 7, 8, 9, 10
#ordering <- names(unlist(o[[1]][]))
#write.csv(ordering, "ordering.csv")
ggplot(longData, aes(x = Var2, y = Var1)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="", title="") +
  theme_bw() + 
  theme(axis.text.x=element_text(size=5, angle=90, vjust=0.2, hjust=1),
                     axis.text.y=element_text(size=5),
                     legend.text=element_text(size=8),
                     legend.title=element_text(size=8)) +
  theme(axis.text.x=element_text(colour=axis.x.colour)) +
  theme(axis.text.y=element_text(colour=axis.x.colour)) +
  annotate("segment", x = 23.5, xend = 23.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 42.5, xend = 42.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 65.5, xend = 65.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 95.5, xend = 95.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 110.5, y = 23.5, yend = 23.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 110.5, y = 42.5, yend = 42.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 110.5, y = 65.5, yend = 65.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 110.5, y = 95.5, yend = 95.5, colour = "blue", size = 0.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.

#ggsave("cikkbe_Fig_2.9-3.4.pdf", height = 22, width=23, units="cm")

###Programs in clusters table (Fig. 11)

table(clust[,30], clust[,1])
##     
##       1  2  3  4  5  6
##   1  19  0  2  0 11  0
##   2   4 19  0  0  6  0
##   3   3  0 41  0  0  3
##   4   0  0  0 10  2  0
##   5   6  0  4  0  2 32
##   6   4  0  4  0 12  1
##   7   6  0  0  0  0  0
##   8   0  0  7  0  0  0
##   9   3  0  2  0 11  0
##   10  0  0  0  9  0  0
table(clust[1:110,30], clust[1:110,1]) #programs
##     
##       1  2  3  4  5  6
##   1  12  0  1  0  6  0
##   2   1  7  0  0  5  0
##   3   1  0 22  0  0  1
##   4   0  0  0  4  1  0
##   5   2  0  2  0  1 15
##   6   3  0  3  0  6  1
##   7   3  0  0  0  0  0
##   8   0  0  4  0  0  0
##   9   1  0  0  0  4  0
##   10  0  0  0  4  0  0
table(clust[111:223,30], clust[111:223,1]) #occuptions
##     
##       1  2  3  4  5  6
##   1   7  0  1  0  5  0
##   2   3 12  0  0  1  0
##   3   2  0 19  0  0  2
##   4   0  0  0  6  1  0
##   5   4  0  2  0  1 17
##   6   1  0  1  0  6  0
##   7   3  0  0  0  0  0
##   8   0  0  3  0  0  0
##   9   2  0  2  0  7  0
##   10  0  0  0  5  0  0
clu <- data.frame(
  name=rownames(clust),
  gamma0=clust[,1],
  gamma2.9=clust[,30])

colnames(degree.oc) <- colnames(degree.tr)
degree <- rbind(degree.tr, degree.oc)

clu <- left_join(clu, degree, by=c("name"="names"))

clu15 <- clu[clu$degree.tr>15,]
ggplot(clu[clu$degree.tr>15,]) + geom_point(aes(x=gamma2.9, y=gamma0), color="red") + 
  geom_text_repel(aes(x=gamma2.9, y=gamma0, label=name), size=3) + 
  scale_x_discrete(limits=1:10) + 
  scale_y_discrete(limits=1:5) + 
  labs(x="alpha=2.9", y="alpha=0") + 
  theme_bw()
## Warning in scale_x_discrete(limits = 1:10): Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
## Warning in scale_y_discrete(limits = 1:5): Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
## Warning: ggrepel: 121 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#ggsave("Fig_clusters.pdf", height = 22, width=42, units="cm")

##6.3. Multi-resolution method with Louvain: occupations

###Scanning gamma

Q.var <- rep(0, 101)
clust <- matrix(0, nrow=223, ncol=101) #223 node, 101 gamma value
L <- rep(0, 101)
connections <- rep(0, 101)

for (i in 0:100){
j <- i/10
DF.pur <- melt(A)
DF.pur <- DF.pur[DF.pur$value!=0,]
DF.pur$Var2 <- as.factor(DF.pur$Var2)

degree.tr <- data.frame(rownames(A),rowSums(A))
names(degree.tr) <- c("names", "degree.tr")
degree.oc <- data.frame(colnames(A),colSums(A))
names(degree.oc) <- c("names", "degree.oc")

DF.pur <- left_join(DF.pur, degree.tr, by=c("Var1"="names"))
DF.pur <- left_join(DF.pur, degree.oc, by=c("Var2"="names"))

DF.pur$exp <- round((DF.pur$degree.tr * DF.pur$degree.oc * j) / sum(A)) #(k_i*k_j)/(L)
DF.pur$diff <- DF.pur$value - DF.pur$exp
DF.pur$sparse <- ifelse(DF.pur$diff>=0, DF.pur$value, 0)
L[i+1] <- sum(DF.pur$sparse)
connections[i+1] <- 2054-sum(DF.pur$sparse==0)

A.pur <- acast(DF.pur, Var1 ~ Var2, value.var = "sparse")
A.pur[is.na(A.pur)] <- 0

g.pur <- graph.incidence(A.pur, directed=F, weighted = T)

lou <- cluster_louvain(g.pur)

clust[,i+1] <- lou$membership
Q.var[i+1] <- modularity(lou)
}

rownames(clust) <- lou$names
colnames(clust) <- seq(0, 10, 0.1)

###Similarity of nodes

S <- matrix(0, nrow=223, ncol=223)
rownames(S) <- rownames(clust)
colnames(S) <- rownames(clust)
res1 <- 30
res2 <- 35
for (i in 1:223){
  for (j in 1:223){
    S[i,j] <- sum(clust[i,res1:res2]-clust[j,res1:res2]==0)
    S[j,i] <- S[i,j]
  }
}

S.oc <- S[111:223, 111:223]
S.oc <- S.oc/(res2-res1+1)
longData<-melt(S.oc)
longData<-longData[longData$value!=0,]

#o <- seriate(S.oc, method = "BEA_TSP")
#longData$Var1 <- factor(longData$Var1, levels=names(unlist(o[[1]][])))
#longData$Var2 <- factor(longData$Var2, levels=names(unlist(o[[1]][])))

ord <- read.csv("ordering_occup.csv")

longData$Var1 <- factor(longData$Var1, levels=ord[,2])
longData$Var2 <- factor(longData$Var2, levels=ord[,2])

cl <- data.frame(names(unlist(ord[[1]][])))
cl <- data.frame(ord[,2])
colnames(cl)[1] <- "occupations"
cl$occupations <- as.character(cl$occupations)

clust2 <- data.frame(rownames(clust), clust[,30])
rownames(clust2) <- 1:223
colnames(clust2) <- c("programs", "clusters")

cl <- left_join(cl, clust2, by=c("occupations"="programs"))

#color palette for axis
axis.x.colour <- (cl[,2] %>% plyr::mapvalues(from=c(1:10), to=c("#B40404", "#e68a00", "#0B6121", "#000099", "#0099cc", "#00cc99", "#6600cc", "#e600e6", "#999900", "#7094db")))
## The following `from` values were not present in `x`: 10
#ordering <- names(unlist(o[[1]][]))
#write.csv(ordering, "ordering_occup.csv")
ggplot(longData, aes(x = Var2, y = Var1)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="", title="") +
  theme_bw() + 
  theme(axis.text.x=element_text(size=5, angle=90, vjust=0.2, hjust=1),
                     axis.text.y=element_text(size=5),
                     legend.text=element_text(size=8),
                     legend.title=element_text(size=8)) +
  theme(axis.text.x=element_text(colour=axis.x.colour)) +
  theme(axis.text.y=element_text(colour=axis.x.colour)) +
  annotate("segment", x = 20.5, xend = 20.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 42.5, xend = 42.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 68.5, xend = 68.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 84.5, xend = 84.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 113.5, y = 20.5, yend = 20.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 113.5, y = 42.5, yend = 42.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 113.5, y = 68.5, yend = 68.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 113.5, y = 84.5, yend = 84.5, colour = "blue", size = 0.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.

#ggsave("oc_Fig_2.9-3.4_v2.pdf", height = 22, width=23, units="cm")